static const char* narrowphaseKernelsCL= \
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"///////////////////////////////////////\n"
"//	Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float fastDiv(float numerator, float denominator)\n"
"{\n"
"	return native_divide(numerator, denominator);	\n"
"//	return numerator/denominator;	\n"
"}\n"
"\n"
"__inline\n"
"float4 fastDiv4(float4 numerator, float4 denominator)\n"
"{\n"
"	return native_divide(numerator, denominator);	\n"
"}\n"
"\n"
"__inline\n"
"float fastSqrtf(float f2)\n"
"{\n"
"	return native_sqrt(f2);\n"
"//	return sqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastRSqrt(float f2)\n"
"{\n"
"	return native_rsqrt(f2);\n"
"}\n"
"\n"
"__inline\n"
"float fastLength4(float4 v)\n"
"{\n"
"	return fast_length(v);\n"
"}\n"
"\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
"	return fast_normalize(v);\n"
"}\n"
"\n"
"\n"
"__inline\n"
"float sqrtf(float a)\n"
"{\n"
"//	return sqrt(a);\n"
"	return native_sqrt(a);\n"
"}\n"
"\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
"	return cross(a,b);\n"
"}\n"
"\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
"	float4 a1 = make_float4(a.xyz,0.f);\n"
"	float4 b1 = make_float4(b.xyz,0.f);\n"
"	return dot(a1, b1);\n"
"}\n"
"\n"
"__inline\n"
"float length3(const float4 a)\n"
"{\n"
"	return sqrtf(dot3F4(a,a));\n"
"}\n"
"\n"
"__inline\n"
"float dot4(const float4 a, const float4 b)\n"
"{\n"
"	return dot( a, b );\n"
"}\n"
"\n"
"//	for height\n"
"__inline\n"
"float dot3w1(const float4 point, const float4 eqn)\n"
"{\n"
"	return dot3F4(point,eqn) + eqn.w;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
"	return fastNormalize4( n );\n"
"//	float length = sqrtf(dot3F4(a, a));\n"
"//	return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 normalize4(const float4 a)\n"
"{\n"
"	float length = sqrtf(dot4(a, a));\n"
"	return 1.f/length * a;\n"
"}\n"
"\n"
"__inline\n"
"float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
"{\n"
"	float4 eqn;\n"
"	float4 ab = b-a;\n"
"	float4 ac = c-a;\n"
"	eqn = normalize3( cross3(ab, ac) );\n"
"	eqn.w = -dot3F4(eqn,a);\n"
"	return eqn;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"//	Matrix3x3\n"
"///////////////////////////////////////\n"
"\n"
"typedef struct\n"
"{\n"
"	float4 m_row[3];\n"
"}Matrix3x3;\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero();\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity();\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"\n"
"__inline\n"
"Matrix3x3 mtZero()\n"
"{\n"
"	Matrix3x3 m;\n"
"	m.m_row[0] = (float4)(0.f);\n"
"	m.m_row[1] = (float4)(0.f);\n"
"	m.m_row[2] = (float4)(0.f);\n"
"	return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtIdentity()\n"
"{\n"
"	Matrix3x3 m;\n"
"	m.m_row[0] = (float4)(1,0,0,0);\n"
"	m.m_row[1] = (float4)(0,1,0,0);\n"
"	m.m_row[2] = (float4)(0,0,1,0);\n"
"	return m;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
"{\n"
"	Matrix3x3 out;\n"
"	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
"	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
"	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
"	return out;\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
"{\n"
"	Matrix3x3 transB;\n"
"	transB = mtTranspose( b );\n"
"	Matrix3x3 ans;\n"
"	//	why this doesn't run when 0ing in the for{}\n"
"	a.m_row[0].w = 0.f;\n"
"	a.m_row[1].w = 0.f;\n"
"	a.m_row[2].w = 0.f;\n"
"	for(int i=0; i<3; i++)\n"
"	{\n"
"//	a.m_row[i].w = 0.f;\n"
"		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
"		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
"		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
"		ans.m_row[i].w = 0.f;\n"
"	}\n"
"	return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
"	float4 ans;\n"
"	ans.x = dot3F4( a.m_row[0], b );\n"
"	ans.y = dot3F4( a.m_row[1], b );\n"
"	ans.z = dot3F4( a.m_row[2], b );\n"
"	ans.w = 0.f;\n"
"	return ans;\n"
"}\n"
"\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
"	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
"	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
"	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
"\n"
"	float4 ans;\n"
"	ans.x = dot3F4( a, colx );\n"
"	ans.y = dot3F4( a, coly );\n"
"	ans.z = dot3F4( a, colz );\n"
"	return ans;\n"
"}\n"
"\n"
"///////////////////////////////////////\n"
"//	Quaternion\n"
"///////////////////////////////////////\n"
"\n"
"typedef float4 Quaternion;\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b);\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in);\n"
"\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec);\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q);\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
"\n"
"\n"
"\n"
"__inline\n"
"Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
"	Quaternion ans;\n"
"	ans = cross3( a, b );\n"
"	ans += a.w*b+b.w*a;\n"
"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
"	ans.w = a.w*b.w - dot3F4(a, b);\n"
"	return ans;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtNormalize(Quaternion in)\n"
"{\n"
"	return fastNormalize4(in);\n"
"//	in /= length( in );\n"
"//	return in;\n"
"}\n"
"__inline\n"
"float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
"	Quaternion qInv = qtInvert( q );\n"
"	float4 vcpy = vec;\n"
"	vcpy.w = 0.f;\n"
"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
"	return out;\n"
"}\n"
"\n"
"__inline\n"
"Quaternion qtInvert(Quaternion q)\n"
"{\n"
"	return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"\n"
"__inline\n"
"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
"	return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"\n"
"__inline\n"
"Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
"{\n"
"	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
"	Matrix3x3 out;\n"
"\n"
"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
"	out.m_row[0].w = 0.f;\n"
"\n"
"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
"	out.m_row[1].w = 0.f;\n"
"\n"
"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
"	out.m_row[2].w = 0.f;\n"
"\n"
"	return out;\n"
"}\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"#define HEIGHT_RES 4\n"
"#define SHAPE_CONVEX_HEIGHT_FIELD 1//keep this in sync with AdlCollisionShape.h!\n"
"\n"
"typedef struct\n"
"{\n"
"	float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];\n"
"	u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];\n"
"	u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];\n"
"\n"
"	float m_scale;\n"
"	float m_padding0;\n"
"	float m_padding1;\n"
"	float m_padding2;\n"
"} ShapeData;\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_height4[HEIGHT_RES*HEIGHT_RES*6/4];\n"
"\n"
"	float m_scale;\n"
"} ShapeDeviceData;\n"
"\n"
"typedef struct\n"
"{\n"
"	float4 m_pos;\n"
"	float4 m_quat;\n"
"	float4 m_linVel;\n"
"	float4 m_angVel;\n"
"\n"
"	u32 m_shapeIdx;\n"
"	u32 m_shapeType;\n"
"	\n"
"	float m_invMass;\n"
"	float m_restituitionCoeff;\n"
"	float m_frictionCoeff;\n"
"} BodyData;\n"
"\n"
"typedef struct\n"
"{\n"
"	float4 m_worldPos[4];\n"
"	float4 m_worldNormal;	//	w: m_nPoints\n"
"//	float m_restituitionCoeff;\n"
"//	float m_frictionCoeff;\n"
"	u32 m_coeffs;\n"
"	u32 m_batchIdx;\n"
"//	int m_nPoints;\n"
"//	int m_padding0;\n"
"\n"
"	u32 m_bodyAPtr;//x:m_bodyAPtr, y:m_bodyBPtr\n"
"	u32 m_bodyBPtr;\n"
"} Contact4;\n"
"\n"
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
"	int m_nPairs;\n"
"	float m_collisionMargin;\n"
"	int m_capacity;\n"
"	int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"__inline\n"
"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
"{\n"
"	return qtRotate( *orientation, *p ) + (*translation);\n"
"}\n"
"\n"
"__inline\n"
"float4 invTransform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
"{\n"
"	return qtRotate( qtInvert( *orientation ), (*p)-(*translation) ); // use qtInvRotate\n"
"}\n"
"\n"
"void CubeMapUtilsCalcCrd(const float4 p, int* faceIdxOut, float* x, float* y)\n"
"{\n"
"	{\n"
"		int idx;\n"
"		float r2[] = {p.x*p.x, p.y*p.y, p.z*p.z};\n"
"\n"
"		if (r2[1]>r2[0])\n"
"		{\n"
"			if (r2[2]>r2[1])\n"
"			{\n"
"				idx = 2;\n"
"			\n"
"			} else\n"
"			{\n"
"				idx = 1;\n"
"			}\n"
"		\n"
"		} else\n"
"		{\n"
"			if (r2[2]>r2[0])\n"
"			{\n"
"				idx = 2;\n"
"			} else\n"
"			{\n"
"				idx = 0;\n"
"			}\n"
"		}\n"
"\n"
"		*faceIdxOut = (idx*2);\n"
"//==\n"
"		float4 abs = make_float4( fabs(p.x), fabs(p.y), fabs(p.z), 0.f );\n"
"\n"
"		float d;\n"
"		if( idx == 0 )\n"
"		{\n"
"			*x = p.y;\n"
"			*y = p.z;\n"
"			d = abs.x;\n"
"			*faceIdxOut += (p.x < 0.f)? 0: 1.f;\n"
"		}\n"
"		else if( idx == 1 )\n"
"		{\n"
"			*x = p.z;\n"
"			*y = p.x;\n"
"			d = abs.y;\n"
"			*faceIdxOut += (p.y < 0.f)? 0: 1.f;\n"
"		}\n"
"		else\n"
"		{\n"
"			*x = p.x;\n"
"			*y = p.y;\n"
"			d = abs.z;\n"
"			*faceIdxOut += (p.z < 0.f)? 0: 1.f;\n"
"		}\n"
"\n"
"		float dInv = (d==0.f)? 0.f: fastDiv(1.f,d);\n"
"		*x = (*x*dInv+1.f)*0.5f;\n"
"		*y = (*y*dInv+1.f)*0.5f;\n"
"	}\n"
"}\n"
"\n"
"float4 CubeMapUtilsCalcVector(int faceIdx, float x, float y)\n"
"{\n"
"	int dir = faceIdx/2;\n"
"	float z = (faceIdx%2 == 0)? -1.f:1.f;\n"
"\n"
"	x = x*2.f-1.f;\n"
"	y = y*2.f-1.f;\n"
"	\n"
"	if( dir == 0 )\n"
"	{\n"
"		return make_float4(z, x, y, 0.f);\n"
"	}\n"
"	else if( dir == 1 )\n"
"	{\n"
"		return make_float4(y,z,x, 0.f);\n"
"	}\n"
"	else\n"
"	{\n"
"		return make_float4(x,y,z, 0.f);\n"
"	}\n"
"}\n"
"\n"
"typedef int Face;\n"
"\n"
"u32 sample(__local ShapeDeviceData* shape, int face, int x, int y)\n"
"{\n"
"\n"
"	int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n"
"	__local u8* height = (__local u8*)shape->m_height4;\n"
"	return height[idx];\n"
"}\n"
"\n"
"u32 sampleSupportGlobal(__global ShapeData* shape, int face, int x, int y)\n"
"{\n"
"\n"
"	int idx = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;\n"
"	__global u8* height = (__global u8*)shape->m_supportHeight4;\n"
"	return height[idx];\n"
"}\n"
"\n"
"float4 sampleNormal(__local ShapeData* shape, int face, int x, int y)\n"
"{\n"
"	return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n"
"}\n"
"\n"
"float4 sampleNormalGlobal(const __global ShapeData* shape, int face, int x, int y)\n"
"{\n"
"	return shape->m_normal[HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES];\n"
"}\n"
"\n"
"float4 ShapeDataCalcSamplePoint( __local const ShapeDeviceData* shape, int sIdx )//u8 height, int sIdx, float scale )\n"
"{\n"
"	const float oneOver255 = 1.f/255.f;\n"
"\n"
"	int faceIdx = fastDiv(sIdx,(HEIGHT_RES*HEIGHT_RES));\n"
"	int r = (sIdx%(HEIGHT_RES*HEIGHT_RES));\n"
"	int i = r/HEIGHT_RES;\n"
"	int j = r%HEIGHT_RES;\n"
"\n"
"	float4 v;\n"
"	float x = fastDiv((i+0.5f),(float)HEIGHT_RES);\n"
"	float y = fastDiv((j+0.5f),(float)HEIGHT_RES);\n"
"	v = CubeMapUtilsCalcVector(faceIdx, x, y);\n"
"	v = normalize3( v );\n"
"\n"
"	int quantizedHeight = sample( shape, faceIdx, i, j );\n"
"	float rheight = quantizedHeight*oneOver255*shape->m_scale;\n"
"	return rheight*v;\n"
"}\n"
"\n"
"float ShapeDataQueryDistance(__local const ShapeDeviceData* shape, float4 p )\n"
"{\n"
"	if( dot3F4( p, p ) >= shape->m_scale*shape->m_scale ) return FLT_MAX;\n"
"\n"
"	const float oneOver255 = 1.f/255.f;\n"
"\n"
"	int faceIdx;\n"
"	float x, y;\n"
"	CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
"	x = (x*HEIGHT_RES) - 0.5f;\n"
"	y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
"	float height;\n"
"	{\n"
"		int xi = (int)(x);\n"
"		int yi = (int)(y);\n"
"		float dx = x-xi;\n"
"		float dy = y-yi;\n"
"\n"
"		{\n"
"			int xip = min2((int)(HEIGHT_RES-1), xi+1);\n"
"			int yip = min2((int)(HEIGHT_RES-1), yi+1);\n"
"\n"
"			u32 xy = sample( shape, faceIdx, xi, yi );\n"
"			u32 xpy = sample( shape, faceIdx, xip, yi );\n"
"			u32 xpyp = sample( shape, faceIdx, xip, yip );\n"
"			u32 xyp = sample( shape, faceIdx, xi, yip );\n"
"\n"
"			height = (xy*(1.f-dx)+xpy*dx)*(1.f-dy) + (xyp*(1.f-dx)+xpyp*dx)*dy;\n"
"			height = height*oneOver255*shape->m_scale;\n"
"\n"
"			p.w = 0.f;\n"
"\n"
"			height = fastLength4( p ) - height;\n"
"		}\n"
"	}\n"
"\n"
"	return height;\n"
"}\n"
"\n"
"float ShapeDataQuerySupportHeight(__global ShapeData* shape, float4 p )\n"
"{\n"
"	int faceIdx;\n"
"	float x, y;\n"
"	CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
"	x = (x*HEIGHT_RES) - 0.5f;\n"
"	y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
"	float height;\n"
"	{\n"
"		int xi = (int)(x);\n"
"		int yi = (int)(y);\n"
"\n"
"		{\n"
"			int xip = min2((int)(HEIGHT_RES-1), xi+1);\n"
"			int yip = min2((int)(HEIGHT_RES-1), yi+1);\n"
"\n"
"			u32 xy = sampleSupportGlobal( shape, faceIdx, xi, yi );\n"
"			u32 xpy = sampleSupportGlobal( shape, faceIdx, xip, yi );\n"
"			u32 xpyp = sampleSupportGlobal( shape, faceIdx, xip, yip );\n"
"			u32 xyp = sampleSupportGlobal( shape, faceIdx, xi, yip );\n"
"\n"
"			height = max2( xy, max2( xpy, max2( xpyp, xyp ) ) );\n"
"			height = height/255.f*shape->m_scale;\n"
"		}\n"
"	}\n"
"\n"
"	return height;\n"
"\n"
"}\n"
"\n"
"float4 ShapeDataQueryNormal(__global const ShapeData* shape,  float4 p )\n"
"{\n"
"	int faceIdx;\n"
"	float x, y;\n"
"	CubeMapUtilsCalcCrd( p, &faceIdx, &x, &y );\n"
"	x = (x*HEIGHT_RES) - 0.5f;\n"
"	y = (y*HEIGHT_RES) - 0.5f;\n"
"\n"
"	float4 normalOut;\n"
"	{\n"
"		int xi = (int)(x);\n"
"		int yi = (int)(y);\n"
"\n"
"		normalOut = sampleNormalGlobal( shape, faceIdx, xi, yi );\n"
"	}\n"
"	return normalOut;\n"
"}\n"
"\n"
"\n"
"\n"
"//	kernels\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SupportCullingKernel( __global int2* restrict gPairsIn, __global ShapeData* gShapes, \n"
"		__global BodyData* gBodies, \n"
"		__global int2* gPairsOut, \n"
"		counter32_t gNPairs,\n"
"		ConstBuffer cb )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx >= cb.m_nPairs ) return;\n"
"\n"
"	const float collisionMargin = cb.m_collisionMargin;\n"
"	const int capacity = cb.m_capacity;\n"
"\n"
"	int2 pair = gPairsIn[gIdx];\n"
"	BodyData bodyA = gBodies[pair.x];\n"
"	BodyData bodyB = gBodies[pair.y];\n"
"	int shapeAIdx = bodyA.m_shapeIdx;\n"
"	int shapeBIdx = bodyB.m_shapeIdx;\n"
"\n"
"\n"
"	bool collide = false;\n"
"	\n"
"	//only collide if one of the two bodies has a non-zero mass\n"
"	if (bodyA.m_invMass==0.f && bodyB.m_invMass==0.f)\n"
"		return;\n"
"		\n"
"		\n"
"	if (bodyA.m_shapeType == SHAPE_CONVEX_HEIGHT_FIELD && bodyB.m_shapeType==SHAPE_CONVEX_HEIGHT_FIELD)\n"
"	{\n"
"		float4 abInA, baInB;\n"
"		float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
"		{\n"
"			abInA = qtInvRotate( bodyA.m_quat, ab );\n"
"			baInB = qtInvRotate( bodyB.m_quat, -ab );\n"
"		}\n"
"		float hA = ShapeDataQuerySupportHeight( gShapes+shapeAIdx, abInA );\n"
"		float hB = ShapeDataQuerySupportHeight( gShapes+shapeBIdx, baInB );\n"
"\n"
"		float h2 = dot3F4( ab, ab );\n"
"\n"
"		collide = ( hA + hB + collisionMargin > sqrtf(h2) );\n"
"	}\n"
"\n"
"	if( collide )\n"
"	{\n"
"		int dstIdx;\n"
"		AppendInc( gNPairs, dstIdx );\n"
"		if( dstIdx < capacity )\n"
"			gPairsOut[dstIdx] = pair;\n"
"	}\n"
"}\n"
"\n"
"\n"
"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n"
"#define PARALLEL_REDUCE_MAX32(h) "
"	{int lIdx = GET_LOCAL_IDX;"
"	if( lIdx < 32 )"
"	{"
"		h[lIdx] = (h[lIdx].y > h[lIdx+1].y)? h[lIdx]: h[lIdx+1];"
"		mem_fence( CLK_LOCAL_MEM_FENCE );"
"		h[lIdx] = (h[lIdx].y > h[lIdx+2].y)? h[lIdx]: h[lIdx+2];"
"		mem_fence( CLK_LOCAL_MEM_FENCE );"
"		h[lIdx] = (h[lIdx].y > h[lIdx+4].y)? h[lIdx]: h[lIdx+4];"
"		mem_fence( CLK_LOCAL_MEM_FENCE );"
"		h[lIdx] = (h[lIdx].y > h[lIdx+8].y)? h[lIdx]: h[lIdx+8];"
"		mem_fence( CLK_LOCAL_MEM_FENCE );"
"		h[lIdx] = (h[lIdx].y > h[lIdx+16].y)? h[lIdx]: h[lIdx+16];"
"	}}\n"
"\n"
"#define PARALLEL_REDUCE32(h) "
"	{int lIdx = GET_LOCAL_IDX;"
"		if( lIdx < 32 )"
"		{"
"			h[lIdx] += h[lIdx+1];"
"			mem_fence( CLK_LOCAL_MEM_FENCE );"
"			h[lIdx] += h[lIdx+2];"
"			mem_fence( CLK_LOCAL_MEM_FENCE );"
"			h[lIdx] += h[lIdx+4];"
"			mem_fence( CLK_LOCAL_MEM_FENCE );"
"			h[lIdx] += h[lIdx+8];"
"			mem_fence( CLK_LOCAL_MEM_FENCE );"
"			h[lIdx] += h[lIdx+16];"
"		}}\n"
"\n"
"\n"
"float4 extractManifold(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 nearNormal)\n"
"{\n"
"	int nPoints = *nPointsPtr;\n"
"	float4 center = make_float4(0,0,0,0);\n"
"	{	//	calculate center\n"
"		nPoints = min2( nPoints, 32 );\n"
"		{\n"
"			int lIdx = GET_LOCAL_IDX;\n"
"			h[lIdx] = p[lIdx];\n"
"			h[lIdx] = (lIdx<nPoints)? h[lIdx] : make_float4(0,0,0,0);\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		PARALLEL_REDUCE32( h );//working on h[64]\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"//		if( GET_LOCAL_IDX == 0 )\n"
"		{\n"
"			center = fastDiv4( h[0], make_float4(nPoints, nPoints, nPoints, 0.f) );\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		if( nPoints < 4 ) return center;\n"
"	}\n"
"	//	is center set on all the WIs?\n"
"	float4 aVector = p[0] - center;\n"
"	float4 u = normalize3( cross3( nearNormal, aVector ) );\n"
"	float4 v = normalize3( cross3( nearNormal, u ) );\n"
"\n"
"	int idx[4];\n"
"\n"
"	__local int4* a = (__local int4*)h;\n"
"	{	//	select 4\n"
"		{	//	set dot of 4 directions for xyzw\n"
"			int ie = GET_LOCAL_IDX;\n"
"			{\n"
"				float f;\n"
"				float4 r = p[ie]-center;\n"
"				f = dot3F4( u, r );\n"
"				a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"				f = dot3F4( -u, r );\n"
"				a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"				f = dot3F4( v, r );\n"
"				a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"				f = dot3F4( -v, r );\n"
"				a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"				if( ie >= nPoints ) a[ie] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		{	//	vector reduce, h[64]\n"
"			int lIdx = GET_LOCAL_IDX;\n"
"			if( lIdx < 32 )\n"
"			{\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"	{\n"
"		{	//	set to idx\n"
"			idx[0] = (int)a[0].x & 0xff;\n"
"			idx[1] = (int)a[0].y & 0xff;\n"
"			idx[2] = (int)a[0].z & 0xff;\n"
"			idx[3] = (int)a[0].w & 0xff;\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"		float4 selection;\n"
"		if( GET_LOCAL_IDX < 4 ) selection = p[idx[GET_LOCAL_IDX]];\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"		if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n"
"	}\n"
"\n"
"\n"
"	return center;\n"
"}\n"
"\n"
"void extractManifold1(__local float4* p, __local float4* h, __local int* nPointsPtr, float4 center)\n"
"{\n"
"	__local int* a = (__local int*)h;\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		float4 selection;\n"
"		if( GET_LOCAL_IDX < 4 )\n"
"		{\n"
"			int idx = (int)a[GET_LOCAL_IDX] & 0xff;\n"
"			selection = p[idx];\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"		if( GET_LOCAL_IDX < 4 ) p[GET_LOCAL_IDX] = selection;\n"
"	}\n"
"\n"
"}\n"
"\n"
"void extractManifold2(	__local float4* p0, __local int* nPointsPtr0, float4 nearNormal0,\n"
"						__local float4* p1, __local int* nPointsPtr1, float4 nearNormal1,\n"
"						__local float4* h, float4 centerOut[2])\n"
"{\n"
"\n"
"	int nPoints[2];\n"
"	nPoints[0] = *nPointsPtr0;\n"
"	nPoints[1] = *nPointsPtr1;\n"
"	float4 center[2];\n"
"	center[0] = make_float4(0,0,0,0);\n"
"	center[1] = make_float4(0,0,0,0);\n"
"	{	//	calculate center\n"
"		nPoints[0] = min2( nPoints[0], 32 );\n"
"		nPoints[1] = min2( nPoints[1], 32 );\n"
"		{\n"
"			int lIdx = GET_LOCAL_IDX;\n"
"			h[lIdx] = (lIdx<nPoints[0])? p0[lIdx] : make_float4(0,0,0,0);\n"
"			h[lIdx+64] = (lIdx<nPoints[1])? p1[lIdx] : make_float4(0,0,0,0);\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		{\n"
"			int bIdx = GET_LOCAL_IDX/32;\n"
"			int eIdx = GET_LOCAL_IDX%32;\n"
"			int lIdx = eIdx + bIdx*64;\n"
"			{\n"
"				h[lIdx] += h[lIdx+1];\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] += h[lIdx+2];\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] += h[lIdx+4];\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] += h[lIdx+8];\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] += h[lIdx+16];\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		for(int bIdx=0; bIdx<2; bIdx++)\n"
"		{\n"
"			center[bIdx] = fastDiv4( h[bIdx*64], make_float4(nPoints[bIdx], nPoints[bIdx], nPoints[bIdx], 0.f) );\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"\n"
"	centerOut[0] = center[0];\n"
"	centerOut[1] = center[1];\n"
"\n"
"	float4 u[2];\n"
"	float4 v[2];\n"
"\n"
"	{\n"
"		float4 aVector = p0[0] - center[0];\n"
"		u[0] = normalize3( cross3( nearNormal0, aVector ) );\n"
"		v[0] = normalize3( cross3( nearNormal0, u[0] ) );\n"
"	}\n"
"	{\n"
"		float4 aVector = p1[0] - center[1];\n"
"		u[1] = normalize3( cross3( nearNormal1, aVector ) );\n"
"		v[1] = normalize3( cross3( nearNormal1, u[1] ) );\n"
"	}\n"
"\n"
"	{\n"
"		__local int4* a = (__local int4*)h;\n"
"		{	//	select 4\n"
"			{	//	set dot of 4 directions for xyzw\n"
"				int ie = GET_LOCAL_IDX%32;\n"
"				int setIdx = GET_LOCAL_IDX/32;\n"
"				{\n"
"					float f;\n"
"					float4 r = p0[ie + setIdx*32]-center[setIdx];\n"
"					f = dot3F4( u[setIdx], r );\n"
"					a[ie + setIdx*64].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"					f = dot3F4( -u[setIdx], r );\n"
"					a[ie + setIdx*64].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"					f = dot3F4( v[setIdx], r );\n"
"					a[ie + setIdx*64].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"					f = dot3F4( -v[setIdx], r );\n"
"					a[ie + setIdx*64].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
"\n"
"					if( ie >= nPoints[setIdx] ) a[ie + setIdx*64] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
"\n"
"					a[ie + 32] = make_int4(-0xfffffff, -0xfffffff, -0xfffffff, -0xfffffff);\n"
"				}\n"
"			}\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		{	//	vector reduce, h[64]\n"
"			int bIdx = GET_LOCAL_IDX/32;\n"
"			int eIdx = GET_LOCAL_IDX%32;\n"
"			int lIdx = eIdx + bIdx*64;\n"
"			{\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+1] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+2] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+4] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+8] );\n"
"				mem_fence( CLK_LOCAL_MEM_FENCE );\n"
"				h[lIdx] = max2( h[lIdx], h[lIdx+16] );\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"	__local int* a = (__local int*)h;\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		\n"
"		float4 selection;\n"
"\n"
"		int bIdx = GET_LOCAL_IDX/32;\n"
"		int eIdx = GET_LOCAL_IDX%32;\n"
"\n"
"		if( eIdx < 4 )\n"
"		{\n"
"			int idx = (int)a[eIdx+64*4*bIdx] & 0xff;\n"
"			selection = p0[idx+32*bIdx];\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"		if( eIdx < 4 ) p0[eIdx+32*bIdx] = selection;\n"
"	}\n"
"}\n"
"\n"
"/*\n"
"1. Query Normal\n"
"2. Fill Normal\n"
"3. A->B, B->A\n"
"*/\n"
"\n"
"void testVtx(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
"			__local ShapeDeviceData* shapeAPtr, __local ShapeDeviceData* shapeBPtr,\n"
"			__local int* lNContacts, __local float4* lCPoints)\n"
"{\n"
"	int pIdx = GET_LOCAL_IDX;\n"
"	float4 bodyAPos = bodyAPtr->m_pos;\n"
"	float4 bodyBPos = bodyBPtr->m_pos;\n"
"	Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"	Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
"	while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n"
"	{\n"
"		float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx );\n"
"\n"
"		float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
"//		Aabb bodyAAabb = bodyAPtr->m_aabb;\n"
"//		if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n"
"		{\n"
"			float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
"			float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n"
"			if( dist < 0.010f )\n"
"			{\n"
"				int dstIdx = atom_add( lNContacts, 1 );\n"
"				if( dstIdx < 32 )\n"
"				{\n"
"					lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
"				}\n"
"			}\n"
"		}\n"
"\n"
"		pIdx += GET_GROUP_SIZE;\n"
"	}\n"
"}\n"
"\n"
"void testVtx2(__local const BodyData* bodyA, __local const BodyData* bodyB,\n"
"			__local const ShapeDeviceData* shapeA, __local const ShapeDeviceData* shapeB,\n"
"			__local int* lNContactsA, __local float4* lCPointsA,\n"
"			__local int* lNContactsB, __local float4* lCPointsB, float collisionMargin )\n"
"{\n"
"	int pIdx = GET_LOCAL_IDX;\n"
"\n"
"	while( pIdx < HEIGHT_RES*HEIGHT_RES*6*2 )\n"
"	{\n"
"		__local const BodyData* bodyAPtr			=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyA:bodyB;\n"
"		__local const BodyData* bodyBPtr			=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?bodyB:bodyA;\n"
"		__local const ShapeDeviceData* shapeAPtr	=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeA:shapeB;\n"
"		__local const ShapeDeviceData* shapeBPtr	=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?shapeB:shapeA;\n"
"		__local int* lNContacts				=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lNContactsA:lNContactsB;\n"
"		__local float4* lCPoints			=( pIdx < HEIGHT_RES*HEIGHT_RES*6 )?lCPointsA:lCPointsB;\n"
"\n"
"		float4 bodyAPos = bodyAPtr->m_pos;\n"
"		float4 bodyBPos = bodyBPtr->m_pos;\n"
"		Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"		Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
"\n"
"		float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n"
"\n"
"		float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
"//		Aabb bodyAAabb = bodyAPtr->m_aabb;\n"
"//		if( AabbOverlapsPoint( &bodyAAabb, pInW ) )\n"
"		{\n"
"			float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
"			float dist = ShapeDataQueryDistance( shapeAPtr, pInA );\n"
"			if( dist < collisionMargin )\n"
"			{\n"
"				int dstIdx = atom_add( lNContacts, 1 );\n"
"				if( dstIdx < 32 )\n"
"				{\n"
"					lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
"				}\n"
"			}\n"
"		}\n"
"\n"
"		pIdx += GET_GROUP_SIZE;\n"
"	}\n"
"}\n"
"\n"
"void testVtxWithPlane(__local BodyData* bodyA, __local BodyData* bodyB,\n"
"			float4 nA, __local ShapeDeviceData* shapeB,\n"
"			__local int* lNContactsA, __local float4* lCPointsA, float collisionMargin)\n"
"{\n"
"	int pIdx = GET_LOCAL_IDX;\n"
"\n"
"	while( pIdx < HEIGHT_RES*HEIGHT_RES*6 )\n"
"	{\n"
"		__local BodyData* bodyAPtr			=bodyA;\n"
"		__local BodyData* bodyBPtr			=bodyB;\n"
"		__local ShapeDeviceData* shapeBPtr	=shapeB;\n"
"		__local int* lNContacts				=lNContactsA;\n"
"		__local float4* lCPoints				=lCPointsA;\n"
"\n"
"		float4 bodyAPos = bodyAPtr->m_pos;\n"
"		float4 bodyBPos = bodyBPtr->m_pos;\n"
"		Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"		Quaternion bodyBQuat = bodyBPtr->m_quat;\n"
"\n"
"		float4 pInB = ShapeDataCalcSamplePoint( shapeBPtr, pIdx%(HEIGHT_RES*HEIGHT_RES*6) );\n"
"\n"
"		float4 pInW = transform( &pInB, &bodyBPos, &bodyBQuat );\n"
"		{\n"
"			float4 pInA = invTransform( &pInW, &bodyAPos, &bodyAQuat );\n"
"\n"
"			float dist = dot3w1( pInA, nA );//ShapeDataQueryDistance( shapeAPtr, pInA );\n"
"			if( dist < collisionMargin )\n"
"			{\n"
"				int dstIdx = atom_add( lNContacts, 1 );\n"
"				if( dstIdx < 32 )\n"
"				{\n"
"					lCPoints[ dstIdx ] = make_float4( pInA.x, pInA.y, pInA.z, dist );\n"
"				}\n"
"			}\n"
"		}\n"
"\n"
"		pIdx += GET_GROUP_SIZE;\n"
"	}\n"
"}\n"
"\n"
"#define GET_SHAPE_IDX(x) (int)((x).m_shapeIdx)\n"
"\n"
"void output(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
"			__local int2* iPair,\n"
"			__local int* lNContacts, __local float4* lCPoints,\n"
"			float4 center, \n"
"			__global ShapeData* shapeData, __global Contact4* contactsOut, float collisionMargin)\n"
"{\n"
"	if( *lNContacts != 0 )\n"
"	{\n"
"		int nContacts = min2( *lNContacts, 4 );\n"
"\n"
"		__global Contact4* c = contactsOut;\n"
"\n"
"		if( GET_LOCAL_IDX < nContacts )\n"
"		{\n"
"			int i = GET_LOCAL_IDX;\n"
"			float4 p = lCPoints[i];\n"
"			float4 bodyAPos = bodyAPtr->m_pos;\n"
"			Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
"			c->m_worldPos[i] = transform( &p, &bodyAPos, &bodyAQuat );\n"
"			c->m_worldPos[i].w = lCPoints[i].w - collisionMargin;\n"
"		}\n"
"\n"
"		if( GET_LOCAL_IDX == 0 )\n"
"		{\n"
"			float4 contactNormal;\n"
"			contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
"			contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
"			c->m_worldNormal = contactNormal;\n"
"//			c->m_restituitionCoeff = 0.f;\n"
"//			c->m_frictionCoeff = 0.7f;\n"
"			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
"			GET_NPOINTS(*c) = nContacts;\n"
"			c->m_bodyAPtr = iPair[0].x;\n"
"			c->m_bodyBPtr = iPair[0].y;\n"
"		}\n"
"	}\n"
"	else\n"
"	{\n"
"		if( GET_LOCAL_IDX == 0 )\n"
"			GET_NPOINTS(contactsOut[0]) = 0;\n"
"	}\n"
"}\n"
"\n"
"//	todo. make it better\n"
"void output2(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
"			int pair0, int pair1,\n"
"			__local int* lNContacts, __local float4* lCPoints,\n"
"			float4 center, \n"
"			const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity,\n"
"			float collisionMargin )\n"
"{\n"
"	int lIdx = GET_LOCAL_IDX%32;\n"
"	int nContacts = min2( *lNContacts, 4 );\n"
"	\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( lIdx == 0 && nContacts)\n"
"	{\n"
"		int dstIdx;\n"
"		AppendInc( nContactsOut, dstIdx );\n"
"		*lNContacts = dstIdx;\n"
"\n"
"		if( dstIdx >= capacity )\n"
"			*lNContacts = -1;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	bool canWrite = (*lNContacts!=-1);\n"
"\n"
"	if( nContacts && canWrite )\n"
"	{\n"
"		__global Contact4* c = contactsOut + (*lNContacts);\n"
"\n"
"		if( lIdx < nContacts )\n"
"		{\n"
"			int i = lIdx;\n"
"			float4 p = lCPoints[i];\n"
"			float4 bodyAPos = bodyAPtr->m_pos;\n"
"			Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
"			p = transform( &p, &bodyAPos, &bodyAQuat );\n"
"			p.w = lCPoints[i].w - collisionMargin;\n"
"			c->m_worldPos[i] = p;\n"
"		}\n"
"\n"
"		if( lIdx == 0 )\n"
"		{\n"
"			if( nContacts )\n"
"			{\n"
"				float4 contactNormal;\n"
"				contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
"				contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
"				c->m_worldNormal = contactNormal;\n"
"//				c->m_restituitionCoeff = 0.f;\n"
"//				c->m_frictionCoeff = 0.7f;\n"
"				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
"				c->m_bodyAPtr = pair0;\n"
"				c->m_bodyBPtr = pair1;\n"
"			}\n"
"			GET_NPOINTS(*c) = nContacts;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"__inline\n"
"void output2LDS(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
"			int pair0, int pair1,\n"
"			int lNContacts, __local float4* lCPoints,\n"
"			float4 center, \n"
"			const __global ShapeData* shapeData, __local Contact4* contactsOut,\n"
"			float collisionMargin )\n"
"{\n"
"	int lIdx = GET_LOCAL_IDX%32;\n"
"//	int lIdx = GET_LOCAL_IDX;\n"
"//	int groupIdx = 0;\n"
"\n"
"	int nContacts = min2( lNContacts, 4 );\n"
"	\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( nContacts != 0  )\n"
"	{\n"
"		if( lIdx < nContacts )\n"
"		{\n"
"			int i = lIdx;\n"
"			float4 p = lCPoints[i];\n"
"			float4 bodyAPos = bodyAPtr->m_pos;\n"
"			Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
"			p = transform( &p, &bodyAPos, &bodyAQuat );\n"
"			p.w = lCPoints[i].w - collisionMargin;\n"
"			contactsOut->m_worldPos[i] = p;\n"
"		}\n"
"	}\n"
"\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		if( nContacts != 0 )\n"
"		{\n"
"			float4 contactNormal;\n"
"			contactNormal = ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
"			contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
"			contactsOut->m_worldNormal = contactNormal;\n"
"//			contactsOut->m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n"
"//			contactsOut->m_restituitionCoeff = 0.f;\n"
"//			contactsOut->m_frictionCoeff = 0.7f;\n"
"			contactsOut->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
"			contactsOut->m_bodyAPtr = pair0;\n"
"			contactsOut->m_bodyBPtr = pair1;\n"
"		}\n"
"		GET_NPOINTS(*contactsOut) = nContacts;//nContacts;\n"
"	}\n"
"\n"
"//	contactsOut[groupIdx].m_worldNormal = make_float4(1.5f,1.4f,1.3f,0.f);\n"
"}\n"
"\n"
"void output2_1(__local BodyData* bodyAPtr, __local BodyData* bodyBPtr,\n"
"			int pair0, int pair1,\n"
"			__local int* lNContacts, __local float4* lCPoints,\n"
"			float4 center, float4 nA, \n"
"			const __global ShapeData* shapeData, __global Contact4* contactsOut, counter32_t nContactsOut, int capacity, float collisionMargin )\n"
"{\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int nContacts = min2( *lNContacts, 4 );\n"
"	\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( lIdx == 0 && nContacts)\n"
"	{\n"
"		int dstIdx;\n"
"		AppendInc( nContactsOut, dstIdx );\n"
"		*lNContacts = dstIdx;\n"
"\n"
"		if( dstIdx >= capacity )\n"
"			*lNContacts = -1;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	bool canWrite = (*lNContacts!=-1);\n"
"\n"
"	if( nContacts && canWrite )\n"
"	{\n"
"		__global Contact4* c = contactsOut + (*lNContacts);\n"
"\n"
"		if( lIdx < nContacts )\n"
"		{\n"
"			int i = lIdx;\n"
"			float4 p = lCPoints[i];\n"
"			float4 bodyAPos = bodyAPtr->m_pos;\n"
"			Quaternion bodyAQuat = bodyAPtr->m_quat;\n"
"\n"
"			p = transform( &p, &bodyAPos, &bodyAQuat );\n"
"			p.w = lCPoints[i].w - collisionMargin;\n"
"			c->m_worldPos[i] = p;\n"
"		}\n"
"\n"
"		if( lIdx == 0 )\n"
"		{\n"
"			if( nContacts )\n"
"			{\n"
"				float4 contactNormal;\n"
"				contactNormal = nA;//ShapeDataQueryNormal( &shapeData[GET_SHAPE_IDX(*bodyAPtr)], center );\n"
"				contactNormal = normalize3( qtRotate( bodyAPtr->m_quat, contactNormal ) );\n"
"\n"
"				c->m_worldNormal = contactNormal;\n"
"//				c->m_restituitionCoeff = 0.f;\n"
"//				c->m_frictionCoeff = 0.7f;\n"
"				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
"				c->m_bodyAPtr = pair0;\n"
"				c->m_bodyBPtr = pair1;\n"
"			}\n"
"			GET_NPOINTS(*c) = nContacts;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"void manifold(__global float4* vIn, __global float4* vOut)\n"
"{\n"
"	__local float4 lCPoints[32];\n"
"	__local float4 lManifoldBuffer[64];\n"
"	__local int lNContacts;\n"
"	__local float4 ab;\n"
"\n"
"	if( GET_LOCAL_IDX<32 )\n"
"	{\n"
"		lCPoints[GET_LOCAL_IDX] = vIn[GET_GLOBAL_IDX];\n"
"	}\n"
"\n"
"	if( GET_LOCAL_IDX == 0 ) \n"
"	{\n"
"		lNContacts = 32;\n"
"		ab = vIn[GET_GLOBAL_IDX];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	float4 center = extractManifold( lCPoints, lManifoldBuffer, &lNContacts, ab );\n"
"\n"
"	if( GET_LOCAL_IDX < lNContacts )\n"
"	{\n"
"		vOut[4*GET_GROUP_IDX+GET_LOCAL_IDX] = lCPoints[GET_LOCAL_IDX];\n"
"	}\n"
"\n"
"}\n"
"\n"
"//#define COMBINE_REDUCTION \n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64, 1, 1)))\n"
"void NarrowphaseKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n"
"					   __global Contact4* restrict contactsOut,\n"
"					   counter32_t nContactsOut, ConstBuffer cb ) \n"
"{\n"
"	//	2.5K LDS\n"
"	__local Contact4 ldsContacts[2];\n"
"	__local BodyData bodyA;\n"
"	__local BodyData bodyB;\n"
"	__local ShapeDeviceData shapeA;\n"
"	__local ShapeDeviceData shapeB;\n"
"	__local float4 lCPointsA[32*2];\n"
"	__local int lNContactsA;\n"
"	__local float4* lCPointsB = lCPointsA+32;\n"
"	__local int lNContactsB;\n"
"#ifdef COMBINE_REDUCTION\n"
"	__local float4 lManifoldBuffer[64*2];\n"
"#else\n"
"	__local float4 lManifoldBuffer[64];\n"
"#endif\n"
"	__local int2 iPairAB;\n"
"\n"
"	const int capacity = cb.m_capacity;\n"
"	const float collisionMargin = cb.m_collisionMargin;\n"
"\n"
"\n"
"	int pairIdx = GET_GROUP_IDX;\n"
"//	for(int pairIdx = GET_GROUP_IDX; pairIdx<nPairs; pairIdx+=GET_NUM_GROUPS)\n"
"	{\n"
"		if( GET_LOCAL_IDX == 0 )	//	load Bodies\n"
"		{\n"
"			int2 pair = pairs[pairIdx];\n"
"			iPairAB = make_int2(pair.x, pair.y);\n"
"			bodyA = bodyDatas[ pair.x ];\n"
"			bodyB = bodyDatas[ pair.y ];\n"
"			shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;\n"
"			shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;\n"
"			lNContactsA = 0;\n"
"			lNContactsB = 0;\n"
"		}\n"
"		\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		//	todo. can check if the shape is the same to previous one. If same, dont read\n"
"		{	//	load shape data\n"
"			int idx = GET_LOCAL_IDX%32;\n"
"			int bIdx = GET_LOCAL_IDX/32;\n"
"			__local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;\n"
"			int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);\n"
"\n"
"			while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )\n"
"			{\n"
"				myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n"
"\n"
"				idx+=32;\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB, collisionMargin );\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
"		float4 center[2];\n"
"\n"
"		if( lNContactsA != 0 || lNContactsB != 0 )\n"
"		{\n"
"			float4 abInA;\n"
"			abInA = qtInvRotate( bodyA.m_quat, ab );\n"
"\n"
"			float4 abInB;\n"
"			abInB = qtInvRotate( bodyB.m_quat, ab );\n"
"\n"
"#ifdef COMBINE_REDUCTION\n"
"			extractManifold2( lCPointsA, &lNContactsA, abInA,\n"
"				lCPointsB, &lNContactsB, abInB,\n"
"				lManifoldBuffer, center );\n"
"#else\n"
"			if( lNContactsA != 0 )\n"
"				center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n"
"			if(  lNContactsB != 0 )\n"
"				center[1] = extractManifold( lCPointsB, lManifoldBuffer, &lNContactsB, abInB );\n"
"#endif\n"
"		}\n"
"\n"
"		int firstSet = GET_LOCAL_IDX/32;\n"
"\n"
"/*\n"
"		if( GET_LOCAL_IDX == 0 )	//	for debug\n"
"		{\n"
"			ldsContacts[0].m_worldNormal = make_float4(-1,-1,-1,0);\n"
"			ldsContacts[0].m_bodyAPtr = 0;\n"
"			ldsContacts[0].m_bodyBPtr = 0;\n"
"			ldsContacts[0].m_batchIdx = 111;\n"
"			ldsContacts[1].m_worldNormal = make_float4(-1,-1,-1,0);\n"
"			ldsContacts[1].m_bodyAPtr = 0;\n"
"			ldsContacts[1].m_bodyBPtr = 0;\n"
"			ldsContacts[1].m_batchIdx = 111;\n"
"		}\n"
"*/\n"
"		bool doReduction = true;\n"
"		if( doReduction )\n"
"		{\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			output2LDS( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n"
"				(firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n"
"				(firstSet)?lNContactsA : lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n"
"				(firstSet)?center[0] : center[1], shapeData, (firstSet)?&ldsContacts[0]: &ldsContacts[1], collisionMargin );\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"		\n"
"			if( GET_LOCAL_IDX == 0 )\n"
"			{\n"
"				if( lNContactsA && lNContactsB )\n"
"				{\n"
"					float nDotn = dot3F4( ldsContacts[0].m_worldNormal, ldsContacts[1].m_worldNormal );\n"
"					if( nDotn < -(1.f-0.01f) )\n"
"					{\n"
"						if( ldsContacts[0].m_bodyAPtr > ldsContacts[1].m_bodyAPtr )\n"
"							lNContactsA = 0;\n"
"						else\n"
"							lNContactsB = 0;\n"
"					}\n"
"				}\n"
"			}\n"
"		\n"
"			if( GET_LOCAL_IDX == 0 )\n"
"			{\n"
"				int n = lNContactsA;\n"
"				if( n != 0 )\n"
"				{\n"
"					int dstIdx;\n"
"					AppendInc( nContactsOut, dstIdx );\n"
"					if( dstIdx < capacity )\n"
"					{	int idx = 0;\n"
"						contactsOut[ dstIdx ] = ldsContacts[idx];\n"
"						contactsOut[ dstIdx].m_batchIdx = pairIdx;\n"
"					}\n"
"				}\n"
"\n"
"				n = lNContactsB;\n"
"				if( n != 0 )\n"
"				{\n"
"					int dstIdx;\n"
"					AppendInc( nContactsOut, dstIdx );\n"
"					if( dstIdx < capacity )\n"
"					{	int idx = 1;\n"
"						contactsOut[ dstIdx ] = ldsContacts[idx];\n"
"						contactsOut[ dstIdx].m_batchIdx = pairIdx;\n"
"					}\n"
"				}\n"
"			}\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"		}\n"
"		else\n"
"		{\n"
"			//output2( (firstSet)?&bodyA: &bodyB, (firstSet)?&bodyB : &bodyA, \n"
"			//	(firstSet)?iPairAB.x : iPairAB.y, (firstSet)?iPairAB.y : iPairAB.x, \n"
"			//	(firstSet)?&lNContactsA : &lNContactsB, (firstSet)?lCPointsA:lCPointsB, \n"
"			//	(firstSet)?center[0] : center[1], shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64, 1, 1)))\n"
"void NarrowphaseWithPlaneKernel( const __global int2* restrict pairs, const __global ShapeData* shapeData, const __global BodyData* restrict bodyDatas, \n"
"					   __global Contact4* restrict contactsOut,\n"
"					   counter32_t nContactsOut, ConstBuffer cb ) \n"
"{\n"
"	//	2.5K LDS\n"
"	__local BodyData bodyA;\n"
"	__local BodyData bodyB;\n"
"	__local ShapeDeviceData shapeA;\n"
"	__local ShapeDeviceData shapeB;\n"
"	__local float4 lCPointsA[32*2];\n"
"	__local int lNContactsA;\n"
"//	__local float4* lCPointsB = lCPointsA+32;\n"
"//	__local int lNContactsB;\n"
"	__local float4 lManifoldBuffer[64];\n"
"	__local int2 iPairAB;\n"
"\n"
"	const int capacity = cb.m_capacity;\n"
"	const float collisionMargin = cb.m_collisionMargin;\n"
"\n"
"	int pairIdx = GET_GROUP_IDX;\n"
"	{\n"
"		if( GET_LOCAL_IDX == 0 )	//	load Bodies\n"
"		{\n"
"			int2 pair = pairs[pairIdx];\n"
"			iPairAB = make_int2(pair.x, pair.y);\n"
"			bodyA = bodyDatas[ pair.x ];\n"
"			bodyB = bodyDatas[ pair.y ];\n"
"			shapeA.m_scale = shapeData[ GET_SHAPE_IDX(bodyA) ].m_scale;\n"
"			shapeB.m_scale = shapeData[ GET_SHAPE_IDX(bodyB) ].m_scale;\n"
"			lNContactsA = 0;\n"
"//			lNContactsB = 0;\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		if (bodyB.m_invMass == 0.f)\n"
"			return;\n"
"			\n"
"		//	todo. can check if the shape is the same to previous one. If same, dont read\n"
"		{	//	load shape data\n"
"			int idx = GET_LOCAL_IDX%32;\n"
"			int bIdx = GET_LOCAL_IDX/32;\n"
"			__local ShapeDeviceData* myShape = (bIdx==0)?&shapeA: &shapeB;\n"
"			int myShapeIdx = (bIdx==0)?GET_SHAPE_IDX(bodyA): GET_SHAPE_IDX(bodyB);\n"
"\n"
"			while( idx < HEIGHT_RES*HEIGHT_RES*6/4 )\n"
"			{\n"
"				myShape->m_height4[idx] = shapeData[ myShapeIdx ].m_height4[idx];\n"
"\n"
"				idx+=32;\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		float4 nA = make_float4(0,1,0,0);\n"
"\n"
"\n"
"//		testVtx2( &bodyA, &bodyB, &shapeA, &shapeB, &lNContactsA, lCPointsA, &lNContactsB, lCPointsB );\n"
"		testVtxWithPlane( &bodyA, &bodyB, nA, &shapeB, &lNContactsA, lCPointsA, collisionMargin );\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"//		float4 ab = bodyB.m_pos - bodyA.m_pos;\n"
"		float4 center[2];\n"
"\n"
"		if( lNContactsA != 0 )\n"
"		{\n"
"			float4 abInA;\n"
"			abInA = nA;//qtInvRotate( bodyA.m_quat, ab );\n"
"\n"
"			if( lNContactsA != 0 )\n"
"				center[0] = extractManifold( lCPointsA, lManifoldBuffer, &lNContactsA, abInA );\n"
"		}\n"
"\n"
"//		int firstSet = GET_LOCAL_IDX/32;\n"
"\n"
"		output2_1( &bodyA, &bodyB, \n"
"			iPairAB.x, iPairAB.y, \n"
"			&lNContactsA, lCPointsA, \n"
"			center[0], nA, shapeData, contactsOut, nContactsOut, capacity, collisionMargin );\n"
"	}\n"
"}\n"
;
